# Load data
songs <- read.csv("songs.csv") %>%
mutate(genre = as.factor(genre))
head(songs[,-1])
str(songs)
## 'data.frame': 6611 obs. of 15 variables:
## $ track_id : chr "2DB4DdfCFMw1iaR6JaR03a" "3uwnnTQcHM1rDqSfA4gQNz" "5WtfUKzXircvW8l5aaVZWT" "3e7sxremeOE3wTySiOhGiP" ...
## $ name : chr "Bam Bam (feat. Ed Sheeran)" "Cool for the Summer" "What A Time (feat. Niall Horan)" "Dusk Till Dawn (feat. Sia) - Radio Edit" ...
## $ artists : chr "Camila Cabello" "Demi Lovato" "Julia Michaels" "ZAYN" ...
## $ genre : Factor w/ 6 levels "EDM","Hip pop",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ danceability : num 0.756 0.583 0.516 0.259 0.573 0.488 0.559 0.694 0.803 0.262 ...
## $ energy : num 0.697 0.61 0.467 0.437 0.739 0.343 0.559 0.712 0.585 0.606 ...
## $ key : int 8 5 0 11 0 4 11 7 8 8 ...
## $ loudness : num -6.38 -5.64 -6.18 -6.59 -5.74 ...
## $ mode : int 1 0 1 0 1 1 1 0 1 1 ...
## $ speechiness : num 0.0401 0.0382 0.0302 0.0386 0.129 0.0436 0.0358 0.046 0.0432 0.0484 ...
## $ acousticness : num 0.182 0.00425 0.662 0.102 0.0285 0.556 0.0145 0.132 0.103 0.247 ...
## $ instrumentalness: num 0.00 1.05e-04 0.00 1.32e-06 0.00 0.00 0.00 0.00 3.94e-06 0.00 ...
## $ liveness : num 0.333 0.14 0.0853 0.106 0.111 0.21 0.138 0.211 0.0644 0.125 ...
## $ valence : num 0.956 0.336 0.386 0.0951 0.451 0.0978 0.338 0.799 0.593 0.275 ...
## $ tempo : num 95 114.1 132.9 180 97.1 ...
# Type of variables
vis_dat(songs)
is.na(songs) %>% colSums()
## track_id name artists genre
## 0 0 0 0
## danceability energy key loudness
## 0 0 0 0
## mode speechiness acousticness instrumentalness
## 0 0 0 0
## liveness valence tempo
## 0 0 0
# Data cleaning
songs_clean <- songs %>%
select(-track_id, -name, -artists)
# mutate(mode = as.factor(mode)) # mode takes only 0 (minor) or 1 (major)
# boxplot of audio features
ggplot(melt(songs_clean[,-c(1,6)]), aes(y = value, fill = variable)) +
geom_boxplot() +
facet_wrap(~variable, scales="free") +
theme(legend.position="none")
# drop outliers in loundness
songs_clean <- songs_clean %>%
filter(loudness > -20)
# Summary
as.data.frame(do.call(cbind, lapply(songs_clean[,-1], summary)))
# Number of songs by genre
songs_clean %>%
count(genre) %>%
kable() %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
| genre | n |
|---|---|
| EDM | 1038 |
| Hip pop | 1265 |
| Latin | 1027 |
| Pop | 1191 |
| R&B | 923 |
| Rock | 1163 |
ggplot(data = songs_clean, aes(y=genre)) +
geom_bar(aes(fill=genre)) +
theme (plot.title = element_text(hjust = 0.5)) +
labs(title = "Music Genres - Number of Cases and Percentages",
y = "Genres",
x = "Count") +
expand_limits(x = 1450) +
geom_text(stat='count',
aes(label = sprintf('%s (%.1f%%)',
after_stat(count),
after_stat(count / sum(count) * 100))),
hjust=ifelse(1.5, -0.1, 1.1)) +
scale_fill_brewer(palette="Accent")
feature_names <- names(songs_clean)[c(2:12)]
songs_clean %>%
select(-mode) %>%
pivot_longer(cols = feature_names[-5]) %>%
ggplot(aes(x = value, colour = genre)) +
geom_density(alpha = 0.3) +
facet_wrap(~name, ncol = 4, scales = "free") +
theme (plot.title = element_text(hjust = 0.5)) +
labs(title = 'Spotify Audio Feature Density by Genre',
x = '', y = 'Density') +
theme(axis.text.y = element_blank()) +
scale_color_brewer(palette="Accent")
songs_clean %>%
select(-mode) %>%
pivot_longer(cols = feature_names[-5]) %>%
ggplot(aes(y = value, fill = genre)) +
geom_boxplot() +
facet_wrap(~name, ncol = 2, scales = "free") +
theme (plot.title = element_text(hjust = 0.5)) +
labs(title = 'Spotify Audio Feature Boxplot by Genre',
x = '', y = '') +
scale_fill_brewer(palette="Accent")
songs_clean %>%
select(feature_names) %>%
scale() %>%
cor() %>%
ggcorrplot(type = "upper", lab = T) +
labs(title = "Correlation between Audio Features", x = "", y = "") +
theme (plot.title = element_text(hjust = 0.5)) +
theme_gray()
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(feature_names)` instead of `feature_names` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
Energy and loudness are fairly highly correlated (0.65). In this case, loudness will be removed, since energy appears to give more distinction between genre groups (as seen in the density and box plot).
# remove loudness
feature_names_reduced <- feature_names[-4]
songs_clean <- songs_clean %>%
select(genre, feature_names_reduced)
# Create correlation matrix using median value of features
genre_corr <- songs_clean %>%
group_by(genre) %>%
summarise_if(is.numeric, median, na.rm = TRUE) %>%
ungroup() %>%
select(-genre) %>%
scale() %>%
t() %>%
as.matrix() %>%
cor()
colnames(genre_corr) <- levels(songs_clean$genre)
row.names(genre_corr) <- levels(songs_clean$genre)
ggcorrplot(genre_corr, type = "upper", lab = T) +
labs(title = "Correlation between Genres", x = "", y = "") +
theme (plot.title = element_text(hjust = 0.5)) +
theme_gray()
logit <- glm(genre ~., data = songs_clean, family = binomial)
summary(logit)
##
## Call:
## glm(formula = genre ~ ., family = binomial, data = songs_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4815 0.0454 0.1722 0.3864 2.8828
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 11.358811 0.573992 19.789 < 2e-16 ***
## danceability -6.088799 0.398092 -15.295 < 2e-16 ***
## energy -9.776767 0.437868 -22.328 < 2e-16 ***
## key 0.001991 0.013287 0.150 0.8809
## mode 0.650747 0.095390 6.822 8.98e-12 ***
## speechiness 3.205314 0.650251 4.929 8.25e-07 ***
## acousticness 3.606148 0.508640 7.090 1.34e-12 ***
## instrumentalness -3.946038 0.276871 -14.252 < 2e-16 ***
## liveness -0.660562 0.301207 -2.193 0.0283 *
## valence 5.997461 0.264019 22.716 < 2e-16 ***
## tempo -0.014599 0.002018 -7.235 4.66e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5746.0 on 6606 degrees of freedom
## Residual deviance: 3055.5 on 6596 degrees of freedom
## AIC: 3077.5
##
## Number of Fisher Scoring iterations: 7
#stargazer(logit, type = 'html', summary = T, report = "vc*stp", ci=T)
songs_scaled <- songs_clean %>%
mutate_if(is.numeric, scale)
song_cov <- cov(songs_scaled[,feature_names_reduced])
song_eigen <- eigen(song_cov)
data.frame(proporation_of_variance = song_eigen$values/sum(song_eigen$values)) %>%
mutate(cumulative_prop = cumsum(proporation_of_variance),
pc = 1:n()) %>%
ggplot(aes(x = pc, y = cumulative_prop)) +
geom_point() +
geom_line() +
ylim(c(0,1)) +
theme (plot.title = element_text(hjust = 0.5)) +
labs(title = 'Cumulative Scree Plot',
x = 'Principal Component',
y = 'Cumulative % of variance explained')
song_eigenvectors <- song_eigen$vectors[,1:2] * -1
song_eigenvectors <- song_eigenvectors %>%
as.data.frame() %>%
mutate(feature = row.names(song_cov)) %>%
rename('PC1' = 'V1',
'PC2' = 'V2')
song_eigenvectors %>%
pivot_longer(cols = c('PC1', 'PC2')) %>%
ggplot(aes(x = feature, y = value)) +
geom_col(aes(fill = feature), position = 'dodge') +
facet_wrap(~name, ncol = 2) +
coord_flip() +
labs(title = 'Principal Component Loadings', x = 'Feature', y = '') +
theme(legend.position="none", plot.title = element_text(hjust = 0.5))
PC <- data.frame(
genre = songs_scaled$genre,
PC1 = as.matrix(songs_scaled[,feature_names_reduced]) %*% song_eigenvectors[,1],
PC2 = as.matrix(songs_scaled[,feature_names_reduced]) %*% song_eigenvectors[,2])
PC %>%
ggplot(aes(x = PC1, y = PC2, color = genre)) +
geom_point(alpha = 0.6) +
facet_wrap(~genre) +
labs(title = 'Plotting principal components 1 vs 2') +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_brewer(palette="Accent")
set.seed(51211)
songs_clean <- songs_clean %>%
mutate(mode = as.factor(mode)) %>%
mutate_if(is.numeric, scale)
train_index <- sample(1:nrow(songs_clean), nrow(songs_clean)*.75, replace = F)
trainSet <- songs_clean[train_index,]
testSet <- songs_clean[-train_index,]
multi.log <- multinom(genre ~ danceability + energy + mode + speechiness +
acousticness + instrumentalness + valence + tempo,
data = trainSet)
stargazer(multi.log, type = 'html', summary = T, report = "vc*stp", ci=T)
| Dependent variable: | |||||
| Hip pop | Latin | Pop | R&B | Rock | |
| (1) | (2) | (3) | (4) | (5) | |
| danceability | 0.032 | -0.288*** | -0.591*** | -0.630*** | -2.188*** |
| (-0.145, 0.209) | (-0.475, -0.101) | (-0.759, -0.423) | (-0.815, -0.445) | (-2.374, -2.001) | |
| t = 0.354 | t = -3.022 | t = -6.876 | t = -6.668 | t = -22.955 | |
| p = 0.724 | p = 0.003 | p = 0.000 | p = 0.000 | p = 0.000 | |
| energy | -1.929*** | -1.081*** | -1.642*** | -2.557*** | -1.507*** |
| (-2.140, -1.718) | (-1.297, -0.865) | (-1.844, -1.440) | (-2.778, -2.337) | (-1.720, -1.294) | |
| t = -17.932 | t = -9.809 | t = -15.924 | t = -22.729 | t = -13.872 | |
| p = 0.000 | p = 0.000 | p = 0.000 | p = 0.000 | p = 0.000 | |
| mode1 | 0.273** | 0.801*** | 0.552*** | 0.016 | 0.826*** |
| (0.013, 0.533) | (0.531, 1.070) | (0.301, 0.802) | (-0.256, 0.288) | (0.560, 1.092) | |
| t = 2.059 | t = 5.828 | t = 4.316 | t = 0.116 | t = 6.088 | |
| p = 0.040 | p = 0.000 | p = 0.00002 | p = 0.908 | p = 0.000 | |
| speechiness | 0.964*** | -0.077 | 0.054 | 0.113 | -1.018*** |
| (0.817, 1.110) | (-0.245, 0.091) | (-0.104, 0.211) | (-0.058, 0.284) | (-1.262, -0.775) | |
| t = 12.909 | t = -0.896 | t = 0.668 | t = 1.297 | t = -8.205 | |
| p = 0.000 | p = 0.371 | p = 0.505 | p = 0.195 | p = 0.000 | |
| acousticness | 0.400*** | 1.432*** | 0.942*** | 0.614*** | 0.450*** |
| (0.123, 0.677) | (1.160, 1.704) | (0.674, 1.209) | (0.340, 0.888) | (0.169, 0.730) | |
| t = 2.832 | t = 10.323 | t = 6.907 | t = 4.393 | t = 3.143 | |
| p = 0.005 | p = 0.000 | p = 0.000 | p = 0.00002 | p = 0.002 | |
| instrumentalness | -1.887*** | -2.760*** | -1.041*** | -1.003*** | -0.257*** |
| (-2.440, -1.335) | (-3.677, -1.843) | (-1.287, -0.795) | (-1.277, -0.730) | (-0.374, -0.140) | |
| t = -6.693 | t = -5.899 | t = -8.302 | t = -7.189 | t = -4.292 | |
| p = 0.000 | p = 0.000 | p = 0.000 | p = 0.000 | p = 0.00002 | |
| valence | 1.001*** | 1.689*** | 0.938*** | 1.479*** | 1.949*** |
| (0.832, 1.170) | (1.510, 1.868) | (0.775, 1.101) | (1.300, 1.657) | (1.771, 2.127) | |
| t = 11.580 | t = 18.467 | t = 11.271 | t = 16.261 | t = 21.440 | |
| p = 0.000 | p = 0.000 | p = 0.000 | p = 0.000 | p = 0.000 | |
| tempo | -0.512*** | -0.253*** | -0.429*** | -0.563*** | -0.503*** |
| (-0.660, -0.364) | (-0.406, -0.100) | (-0.574, -0.284) | (-0.718, -0.408) | (-0.652, -0.354) | |
| t = -6.782 | t = -3.238 | t = -5.804 | t = -7.129 | t = -6.626 | |
| p = 0.000 | p = 0.002 | p = 0.000 | p = 0.000 | p = 0.000 | |
| Constant | 0.687*** | 0.035 | 1.144*** | 0.873*** | 0.018 |
| (0.387, 0.986) | (-0.343, 0.413) | (0.887, 1.401) | (0.605, 1.141) | (-0.279, 0.315) | |
| t = 4.498 | t = 0.181 | t = 8.728 | t = 6.390 | t = 0.116 | |
| p = 0.00001 | p = 0.856 | p = 0.000 | p = 0.000 | p = 0.908 | |
| Akaike Inf. Crit. | 12,108.470 | 12,108.470 | 12,108.470 | 12,108.470 | 12,108.470 |
| Note: | p<0.1; p<0.05; p<0.01 | ||||
mean(predict(multi.log, type="class", newdata=testSet)==testSet$genre)
## [1] 0.5151332
multi.log.ns <-
multinom(genre ~ ns(danceability, df=5) + ns(energy, df=5) + mode +
ns(speechiness, df=5) + ns(acousticness, df=5) + ns(instrumentalness, df=5) +
ns(valence, df=5) + ns(tempo, df=5), data = songs_clean)
stargazer(multi.log.ns, type = 'html', summary = T, report = "vc*stp", ci=T)
| Dependent variable: | |||||
| Hip pop | Latin | Pop | R&B | Rock | |
| (1) | (2) | (3) | (4) | (5) | |
| ns(danceability, df = 5)1 | -1.545 | -1.757* | -1.696** | -2.601*** | -6.091*** |
| (-3.696, 0.607) | (-3.767, 0.254) | (-3.387, -0.005) | (-4.453, -0.748) | (-7.540, -4.641) | |
| t = -1.407 | t = -1.713 | t = -1.966 | t = -2.751 | t = -8.235 | |
| p = 0.160 | p = 0.087 | p = 0.050 | p = 0.006 | p = 0.000 | |
| ns(danceability, df = 5)2 | -1.588 | -2.004* | -2.375** | -3.848*** | -8.828*** |
| (-4.016, 0.841) | (-4.301, 0.294) | (-4.333, -0.418) | (-5.985, -1.711) | (-10.599, -7.056) | |
| t = -1.281 | t = -1.709 | t = -2.379 | t = -3.530 | t = -9.767 | |
| p = 0.201 | p = 0.088 | p = 0.018 | p = 0.0005 | p = 0.000 | |
| ns(danceability, df = 5)3 | 0.320 | 0.443 | -1.360** | -1.741*** | -7.074*** |
| (-1.074, 1.715) | (-0.909, 1.796) | (-2.551, -0.170) | (-3.037, -0.445) | (-8.507, -5.641) | |
| t = 0.450 | t = 0.643 | t = -2.239 | t = -2.634 | t = -9.674 | |
| p = 0.653 | p = 0.521 | p = 0.026 | p = 0.009 | p = 0.000 | |
| ns(danceability, df = 5)4 | -2.880 | -2.870 | -4.604** | -8.107*** | -16.679*** |
| (-7.879, 2.119) | (-7.584, 1.843) | (-8.603, -0.605) | (-12.493, -3.722) | (-20.787, -12.570) | |
| t = -1.129 | t = -1.194 | t = -2.256 | t = -3.623 | t = -7.957 | |
| p = 0.259 | p = 0.233 | p = 0.025 | p = 0.0003 | p = 0.000 | |
| ns(danceability, df = 5)5 | 1.192 | 0.304 | -1.036 | -2.702*** | -12.937*** |
| (-0.290, 2.675) | (-1.253, 1.861) | (-2.502, 0.429) | (-4.313, -1.092) | (-17.196, -8.677) | |
| t = 1.576 | t = 0.383 | t = -1.386 | t = -3.289 | t = -5.953 | |
| p = 0.115 | p = 0.702 | p = 0.166 | p = 0.002 | p = 0.000 | |
| ns(energy, df = 5)1 | 0.689 | 0.663 | -1.818 | -3.034 | -2.782 |
| (-3.759, 5.136) | (-3.737, 5.064) | (-6.069, 2.433) | (-7.291, 1.222) | (-7.078, 1.515) | |
| t = 0.303 | t = 0.295 | t = -0.838 | t = -1.397 | t = -1.269 | |
| p = 0.762 | p = 0.768 | p = 0.402 | p = 0.163 | p = 0.205 | |
| ns(energy, df = 5)2 | -0.573 | 0.289 | -2.411 | -4.231* | -3.334 |
| (-5.340, 4.193) | (-4.413, 4.991) | (-6.947, 2.126) | (-8.798, 0.336) | (-7.920, 1.251) | |
| t = -0.236 | t = 0.121 | t = -1.042 | t = -1.816 | t = -1.425 | |
| p = 0.814 | p = 0.904 | p = 0.298 | p = 0.070 | p = 0.155 | |
| ns(energy, df = 5)3 | -2.607* | -0.618 | -3.477** | -5.947*** | -4.218*** |
| (-5.504, 0.289) | (-3.489, 2.252) | (-6.244, -0.710) | (-8.773, -3.121) | (-7.018, -1.419) | |
| t = -1.764 | t = -0.422 | t = -2.463 | t = -4.124 | t = -2.953 | |
| p = 0.078 | p = 0.673 | p = 0.014 | p = 0.00004 | p = 0.004 | |
| ns(energy, df = 5)4 | 1.536 | 1.727 | -2.965 | -5.404 | -2.780 |
| (-8.125, 11.196) | (-7.817, 11.271) | (-12.186, 6.257) | (-14.697, 3.888) | (-12.097, 6.536) | |
| t = 0.312 | t = 0.355 | t = -0.630 | t = -1.140 | t = -0.585 | |
| p = 0.756 | p = 0.723 | p = 0.529 | p = 0.255 | p = 0.559 | |
| ns(energy, df = 5)5 | -5.544*** | -3.018*** | -6.127*** | -9.464*** | -4.669*** |
| (-7.383, -3.706) | (-4.845, -1.191) | (-7.891, -4.363) | (-11.749, -7.180) | (-6.356, -2.983) | |
| t = -5.911 | t = -3.238 | t = -6.808 | t = -8.121 | t = -5.427 | |
| p = 0.000 | p = 0.002 | p = 0.000 | p = 0.000 | p = 0.00000 | |
| mode1 | 0.299** | 0.855*** | 0.587*** | 0.148 | 0.981*** |
| (0.052, 0.545) | (0.598, 1.112) | (0.353, 0.820) | (-0.105, 0.402) | (0.731, 1.230) | |
| t = 2.375 | t = 6.531 | t = 4.916 | t = 1.146 | t = 7.711 | |
| p = 0.018 | p = 0.000 | p = 0.00000 | p = 0.252 | p = 0.000 | |
| ns(speechiness, df = 5)1 | -0.557 | -3.217*** | -0.763 | -1.328** | -3.397*** |
| (-1.792, 0.678) | (-4.313, -2.121) | (-1.837, 0.310) | (-2.418, -0.239) | (-4.452, -2.343) | |
| t = -0.884 | t = -5.752 | t = -1.393 | t = -2.389 | t = -6.316 | |
| p = 0.377 | p = 0.000 | p = 0.164 | p = 0.017 | p = 0.000 | |
| ns(speechiness, df = 5)2 | -0.253 | -3.383*** | -1.019 | -1.644** | -3.151*** |
| (-1.695, 1.189) | (-4.710, -2.057) | (-2.310, 0.271) | (-2.967, -0.321) | (-4.453, -1.848) | |
| t = -0.344 | t = -5.001 | t = -1.549 | t = -2.435 | t = -4.740 | |
| p = 0.732 | p = 0.00000 | p = 0.122 | p = 0.015 | p = 0.00001 | |
| ns(speechiness, df = 5)3 | 1.046** | -2.643*** | -0.523 | -0.727 | -3.373*** |
| (0.006, 2.086) | (-3.749, -1.538) | (-1.565, 0.518) | (-1.842, 0.388) | (-4.669, -2.076) | |
| t = 1.971 | t = -4.687 | t = -0.985 | t = -1.278 | t = -5.099 | |
| p = 0.049 | p = 0.00001 | p = 0.325 | p = 0.202 | p = 0.00000 | |
| ns(speechiness, df = 5)4 | 0.098 | -7.129*** | -2.850** | -3.690*** | -8.328*** |
| (-2.702, 2.897) | (-9.654, -4.604) | (-5.322, -0.378) | (-6.204, -1.175) | (-10.862, -5.793) | |
| t = 0.068 | t = -5.533 | t = -2.260 | t = -2.876 | t = -6.440 | |
| p = 0.946 | p = 0.00000 | p = 0.024 | p = 0.005 | p = 0.000 | |
| ns(speechiness, df = 5)5 | 2.286*** | -3.408*** | -2.019*** | -2.128*** | -5.646*** |
| (1.129, 3.443) | (-4.783, -2.032) | (-3.334, -0.704) | (-3.544, -0.712) | (-7.667, -3.625) | |
| t = 3.873 | t = -4.857 | t = -3.009 | t = -2.945 | t = -5.475 | |
| p = 0.0002 | p = 0.00001 | p = 0.003 | p = 0.004 | p = 0.00000 | |
| ns(acousticness, df = 5)1 | -0.499* | 0.760** | -0.110 | 0.663** | -0.735** |
| (-1.081, 0.084) | (0.109, 1.411) | (-0.664, 0.444) | (0.011, 1.315) | (-1.331, -0.138) | |
| t = -1.677 | t = 2.287 | t = -0.389 | t = 1.994 | t = -2.415 | |
| p = 0.094 | p = 0.023 | p = 0.698 | p = 0.047 | p = 0.016 | |
| ns(acousticness, df = 5)2 | 0.088 | 1.952*** | 0.992** | 1.408*** | -0.679 |
| (-0.725, 0.902) | (1.092, 2.812) | (0.213, 1.771) | (0.540, 2.276) | (-1.488, 0.131) | |
| t = 0.213 | t = 4.447 | t = 2.496 | t = 3.178 | t = -1.644 | |
| p = 0.832 | p = 0.00001 | p = 0.013 | p = 0.002 | p = 0.101 | |
| ns(acousticness, df = 5)3 | 0.789 | 2.996*** | 0.897 | 1.764* | 0.714 |
| (-1.069, 2.648) | (1.159, 4.832) | (-0.916, 2.709) | (-0.073, 3.601) | (-1.148, 2.577) | |
| t = 0.832 | t = 3.196 | t = 0.970 | t = 1.882 | t = 0.752 | |
| p = 0.406 | p = 0.002 | p = 0.333 | p = 0.060 | p = 0.453 | |
| ns(acousticness, df = 5)4 | 2.313 | 6.312*** | 4.696*** | 4.490*** | 0.575 |
| (-0.934, 5.559) | (3.016, 9.609) | (1.491, 7.901) | (1.202, 7.778) | (-2.631, 3.780) | |
| t = 1.396 | t = 3.753 | t = 2.872 | t = 2.676 | t = 0.352 | |
| p = 0.163 | p = 0.0002 | p = 0.005 | p = 0.008 | p = 0.726 | |
| ns(acousticness, df = 5)5 | 4.053 | 8.559*** | 6.796** | 4.960 | 3.406 |
| (-2.377, 10.484) | (2.170, 14.948) | (0.427, 13.165) | (-1.423, 11.342) | (-2.995, 9.807) | |
| t = 1.235 | t = 2.626 | t = 2.091 | t = 1.523 | t = 1.043 | |
| p = 0.217 | p = 0.009 | p = 0.037 | p = 0.128 | p = 0.298 | |
| ns(instrumentalness, df = 5)1 | 2.706*** | 3.122*** | 2.456*** | 2.409*** | 1.623*** |
| (1.601, 3.811) | (0.906, 5.339) | (1.599, 3.314) | (1.494, 3.325) | (0.783, 2.462) | |
| t = 4.798 | t = 2.761 | t = 5.615 | t = 5.158 | t = 3.789 | |
| p = 0.00001 | p = 0.006 | p = 0.00000 | p = 0.00000 | p = 0.0002 | |
| ns(instrumentalness, df = 5)2 | 0.420 | 1.009 | 0.426 | 1.276*** | 1.541*** |
| (-0.710, 1.549) | (-1.166, 3.185) | (-0.471, 1.323) | (0.330, 2.222) | (0.681, 2.402) | |
| t = 0.728 | t = 0.909 | t = 0.931 | t = 2.643 | t = 3.511 | |
| p = 0.467 | p = 0.364 | p = 0.353 | p = 0.009 | p = 0.0005 | |
| ns(instrumentalness, df = 5)3 | -1.719* | -1.274 | -2.293*** | -0.952 | 0.201 |
| (-3.739, 0.300) | (-4.842, 2.295) | (-3.638, -0.948) | (-2.418, 0.515) | (-0.832, 1.235) | |
| t = -1.668 | t = -0.700 | t = -3.342 | t = -1.271 | t = 0.382 | |
| p = 0.096 | p = 0.485 | p = 0.001 | p = 0.204 | p = 0.703 | |
| ns(instrumentalness, df = 5)4 | -0.317 | -2.104** | 0.781 | 0.750 | 1.451* |
| (-1.910, 1.276) | (-3.983, -0.224) | (-0.672, 2.234) | (-0.727, 2.228) | (-0.002, 2.903) | |
| t = -0.390 | t = -2.194 | t = 1.053 | t = 0.995 | t = 1.957 | |
| p = 0.697 | p = 0.029 | p = 0.293 | p = 0.320 | p = 0.051 | |
| ns(instrumentalness, df = 5)5 | -5.602*** | -10.625** | -2.221*** | -3.478*** | -1.274** |
| (-9.281, -1.922) | (-20.737, -0.514) | (-3.505, -0.936) | (-5.431, -1.525) | (-2.264, -0.284) | |
| t = -2.984 | t = -2.060 | t = -3.389 | t = -3.490 | t = -2.523 | |
| p = 0.003 | p = 0.040 | p = 0.001 | p = 0.0005 | p = 0.012 | |
| ns(valence, df = 5)1 | 0.805* | 2.694*** | 1.543*** | 2.508*** | 3.812*** |
| (-0.018, 1.627) | (1.537, 3.850) | (0.815, 2.271) | (1.654, 3.361) | (3.016, 4.609) | |
| t = 1.918 | t = 4.565 | t = 4.153 | t = 5.759 | t = 9.382 | |
| p = 0.056 | p = 0.00001 | p = 0.00004 | p = 0.000 | p = 0.000 | |
| ns(valence, df = 5)2 | 1.780*** | 4.623*** | 2.218*** | 3.910*** | 5.041*** |
| (0.685, 2.874) | (3.173, 6.074) | (1.217, 3.220) | (2.766, 5.054) | (3.959, 6.123) | |
| t = 3.186 | t = 6.247 | t = 4.343 | t = 6.699 | t = 9.132 | |
| p = 0.002 | p = 0.000 | p = 0.00002 | p = 0.000 | p = 0.000 | |
| ns(valence, df = 5)3 | 2.593*** | 3.997*** | 2.160*** | 4.032*** | 5.857*** |
| (1.707, 3.480) | (2.982, 5.013) | (1.305, 3.015) | (3.122, 4.942) | (4.963, 6.751) | |
| t = 5.732 | t = 7.714 | t = 4.954 | t = 8.679 | t = 12.837 | |
| p = 0.000 | p = 0.000 | p = 0.00000 | p = 0.000 | p = 0.000 | |
| ns(valence, df = 5)4 | 3.192*** | 9.336*** | 4.080*** | 7.466*** | 10.348*** |
| (1.150, 5.233) | (6.546, 12.126) | (2.257, 5.903) | (5.352, 9.581) | (8.369, 12.327) | |
| t = 3.065 | t = 6.559 | t = 4.387 | t = 6.920 | t = 10.247 | |
| p = 0.003 | p = 0.000 | p = 0.00002 | p = 0.000 | p = 0.000 | |
| ns(valence, df = 5)5 | 2.656*** | 5.742*** | 3.328*** | 5.128*** | 7.803*** |
| (1.542, 3.770) | (4.656, 6.829) | (2.252, 4.404) | (4.034, 6.221) | (6.731, 8.874) | |
| t = 4.673 | t = 10.360 | t = 6.063 | t = 9.188 | t = 14.269 | |
| p = 0.00001 | p = 0.000 | p = 0.000 | p = 0.000 | p = 0.000 | |
| ns(tempo, df = 5)1 | 0.878 | 2.584* | 1.363 | 1.302 | 3.620*** |
| (-1.703, 3.460) | (-0.240, 5.409) | (-1.189, 3.915) | (-1.258, 3.861) | (0.961, 6.278) | |
| t = 0.667 | t = 1.794 | t = 1.047 | t = 0.997 | t = 2.669 | |
| p = 0.505 | p = 0.073 | p = 0.296 | p = 0.319 | p = 0.008 | |
| ns(tempo, df = 5)2 | -1.190 | 1.849 | -1.449 | -1.647 | 0.585 |
| (-4.051, 1.670) | (-1.287, 4.984) | (-4.255, 1.357) | (-4.474, 1.181) | (-2.344, 3.514) | |
| t = -0.815 | t = 1.155 | t = -1.012 | t = -1.142 | t = 0.391 | |
| p = 0.415 | p = 0.248 | p = 0.312 | p = 0.254 | p = 0.696 | |
| ns(tempo, df = 5)3 | 1.884** | 2.459*** | 0.673 | 0.237 | 1.529* |
| (0.352, 3.417) | (0.817, 4.100) | (-0.826, 2.173) | (-1.314, 1.788) | (-0.028, 3.087) | |
| t = 2.410 | t = 2.935 | t = 0.880 | t = 0.299 | t = 1.924 | |
| p = 0.016 | p = 0.004 | p = 0.379 | p = 0.765 | p = 0.055 | |
| ns(tempo, df = 5)4 | 3.484 | 11.546*** | 1.876 | 2.161 | 4.243 |
| (-2.421, 9.390) | (5.093, 18.000) | (-3.924, 7.677) | (-3.679, 8.001) | (-1.838, 10.324) | |
| t = 1.156 | t = 3.507 | t = 0.634 | t = 0.725 | t = 1.368 | |
| p = 0.248 | p = 0.0005 | p = 0.527 | p = 0.469 | p = 0.172 | |
| ns(tempo, df = 5)5 | -2.830*** | 1.493 | -0.345 | -0.951 | -1.180 |
| (-4.729, -0.930) | (-0.415, 3.401) | (-2.154, 1.463) | (-2.893, 0.991) | (-3.120, 0.760) | |
| t = -2.920 | t = 1.533 | t = -0.374 | t = -0.960 | t = -1.192 | |
| p = 0.004 | p = 0.126 | p = 0.709 | p = 0.338 | p = 0.234 | |
| Constant | -1.779 | -6.723** | 1.630 | 2.429 | 5.166** |
| (-6.936, 3.378) | (-12.259, -1.187) | (-3.138, 6.398) | (-2.410, 7.268) | (0.389, 9.943) | |
| t = -0.676 | t = -2.380 | t = 0.670 | t = 0.984 | t = 2.119 | |
| p = 0.500 | p = 0.018 | p = 0.503 | p = 0.326 | p = 0.035 | |
| Akaike Inf. Crit. | 14,849.350 | 14,849.350 | 14,849.350 | 14,849.350 | 14,849.350 |
| Note: | p<0.1; p<0.05; p<0.01 | ||||
mean(predict(multi.log.ns, type="class", newdata=testSet)==testSet$genre)
## [1] 0.5623487
plotMultiROC <- function(model) {
pred <- data.frame(predict(model, type="prob", newdata=testSet))
colnames(pred) <- paste(levels(testSet$genre), "_pred_M")
true_lable <- data.frame(dummy(testSet$genre))
colnames(true_lable) <- paste(levels(testSet$genre), "_true")
final <- cbind(true_lable, pred)
roc_df <- plot_roc_data(multi_roc(final, force_diag = F)) %>%
filter(Group != "Micro", Group != "Macro")
pr_df <- plot_pr_data(multi_pr(final, force_diag = F)) %>%
filter(Group != "Micro", Group != "Macro")
return(ggplot(roc_df, aes(x=1-Specificity, y=Sensitivity)) +
geom_path(aes(color=Group)) +
geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = "dashed") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_brewer(palette="Accent"))
}
plotMultiROC(multi.log.ns) + labs(title = "Logistic Regression w/ Natrual Spline ROC",
x = "FPR", y = "TPR")
plotCM <- function(model){
return(plot_confusion_matrix(
as_tibble(confusionMatrix(predict(model, type="class", newdata=testSet),
testSet$genre)$table),
target_col = "Reference", prediction_col = "Prediction", counts_col = "n"))
}
plotCM(multi.log.ns)
dt <- rpart(genre ~ danceability + energy + mode + speechiness +
acousticness + instrumentalness + valence + tempo, data = trainSet)
rpart.plot(dt, type = 5, extra = 104,
box.palette = list("#7fc97f", "#beaed4", "#fdc086",
"#ffff99", "#386cb0", "#f0027f"),
leaf.round = 0, fallen.leaves = F, branch = 0.3, under = T,
under.col = 'grey40', main = 'Decision Tree', tweak = 1.2)
mean(predict(dt, type="class", newdata=testSet)==testSet$genre)
## [1] 0.4340194
plotMultiROC(dt) + labs(title = "Decision Tree ROC", x = "FPR", y = "TPR")
plotCM(dt)
rf <- randomForest(as.factor(genre) ~ danceability + energy + mode + speechiness +
acousticness + instrumentalness + valence + tempo, data = trainSet,
ntree = 100, importance = TRUE)
mean(predict(rf, testSet)==testSet$genre)
## [1] 0.5702179
plotMultiROC(rf) + labs(title = "Random Forest ROC", x = "FPR", y = "TPR")
plotCM(rf)
matrix_train <- xgb.DMatrix(data = data.matrix(trainSet[,-1]),
label = as.integer(as.factor(trainSet[,1]))-1
)
matrix_test <- xgb.DMatrix(data = data.matrix(testSet[,-1]),
label = as.integer(as.factor(testSet[,1]))-1
)
xgb_class <- xgboost(data = matrix_train,
nrounds = 50,
verbose = F,
params = list(objective = "multi:softmax",
num_class = 6))
xgb_prob <- xgboost(data = matrix_train,
nrounds = 50,
verbose = F,
params = list(objective = "multi:softprob",
num_class = 6))
mean(predict(xgb_class, matrix_test)+1==as.integer(as.factor(testSet$genre)))
## [1] 0.5671913
pred_prob <- as.data.frame(matrix(predict(xgb_prob, newdata=matrix_test),
nrow=nrow(testSet), byrow = T))
colnames(pred_prob) <- paste(levels(testSet$genre), "_pred_XGB")
true_lable <- data.frame(dummy(testSet$genre))
colnames(true_lable) <- paste(levels(testSet$genre), "_true")
final <- cbind(true_lable, pred_prob)
roc_df <- plot_roc_data(multi_roc(final, force_diag = F)) %>%
filter(Group != "Micro", Group != "Macro")
pr_df <- plot_pr_data(multi_pr(final, force_diag = F)) %>%
filter(Group != "Micro", Group != "Macro")
ggplot(roc_df, aes(x=1-Specificity, y=Sensitivity)) +
geom_path(aes(color=Group)) +
geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = "dashed") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_brewer(palette="Accent") +
labs(title = "Gradient Boost ROC", x = "FPR", y = "TPR")
# map predicted class from integer to factor
pred_class <- as.integer(predict(xgb_class, matrix_test) + 1)
map <- mapLevels(x=levels(trainSet$genre))
mapLevels(x=pred_class) <- map
plot_confusion_matrix(as_tibble(confusionMatrix(pred_class, testSet$genre)$table),
target_col = "Reference", prediction_col = "Prediction",
counts_col = "n")
svm.linear <- svm(as.factor(genre) ~ danceability + energy + mode + speechiness +
acousticness + instrumentalness + valence + tempo,
data = trainSet, kernel = "linear", cost = 10)
mean(predict(svm.linear, testSet)==testSet$genre)
## [1] 0.5175545
svm.poly <- svm(as.factor(genre) ~ danceability + energy + mode + speechiness +
acousticness + instrumentalness + valence + tempo,
data = trainSet, kernel = "poly", cost = 10, probability = T)
mean(predict(svm.poly, testSet)==testSet$genre)
## [1] 0.5435835
svm_pred <- predict(svm.poly, testSet, probability = T)
pred_prob <- as.data.frame(attr(svm_pred, "probabilities"))
colnames(pred_prob) <- paste(colnames(pred_prob), "_pred_SVM")
true_lable <- data.frame(dummy(testSet$genre))
colnames(true_lable) <- paste(levels(testSet$genre), "_true")
final <- cbind(true_lable, pred_prob)
roc_df <- plot_roc_data(multi_roc(final, force_diag = F)) %>%
filter(Group != "Micro", Group != "Macro")
pr_df <- plot_pr_data(multi_pr(final, force_diag = F)) %>%
filter(Group != "Micro", Group != "Macro")
ggplot(roc_df, aes(x=1-Specificity, y=Sensitivity)) +
geom_path(aes(color=Group)) +
geom_segment(aes(x = 0, y = 0, xend = 1, yend = 1), linetype = "dashed") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_brewer(palette="Accent") +
labs(title = "Poly SVM ROC", x = "FPR", y = "TPR")
plotCM(svm.poly)
# importance
dt_imp <- as.data.frame(dt$variable)
dt_imp$feature <- rownames(dt_imp)
# mean decrease in impurity
rf_imp <- as.data.frame(randomForest::importance(rf, 2))
rf_imp$feature <- rownames(rf_imp)
# gain
xgb_imp <- xgb.importance(model = xgb_class) %>%
select(Feature, Gain)
imp <- dplyr::left_join(dt_imp, rf_imp, by = "feature") %>%
dplyr::left_join(xgb_imp, by = c("feature" = "Feature")) %>%
rename('Xgboost' = 'Gain',
'Decision tree' = 'dt$variable',
'Random forest' = 'MeanDecreaseGini')
# scale importance for comparison
imp %>%
mutate_if(is.numeric, scale, center = TRUE) %>%
pivot_longer(cols = c('Xgboost', 'Decision tree', 'Random forest')) %>%
rename('model' = 'name') %>%
ggplot(aes(x = reorder(feature, value, mean, na.rm = TRUE),
y = value, color = model)) +
geom_point(size = 2) +
coord_flip() +
labs(title = 'Scaled Variable Importance by Model',
y = 'Scaled value', x = '') +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_brewer(palette = 'Set2')
data.frame(model = 'Logistic regression', actual = testSet$genre,
predicted = predict(multi.log.ns, testSet, type = "class"),
stringsAsFactors = FALSE) %>%
rbind(data.frame(model = 'Decisoin tree', actual = testSet$genre,
predicted = predict(dt, testSet, type = "class"),
stringsAsFactors = FALSE)) %>%
rbind(data.frame(model = 'Random forest', actual = testSet$genre,
predicted = predict(rf, testSet, type = "class"),
stringsAsFactors = FALSE)) %>%
rbind(data.frame(model = 'Xgboost', actual = testSet$genre,
predicted = pred_class,
stringsAsFactors = FALSE)) %>%
rbind(data.frame(model = 'SVM', actual = testSet$genre,
predicted = predict(svm.poly, testSet),
stringsAsFactors = FALSE)) %>%
count(actual, predicted, model) %>%
mutate(match = ifelse(actual == predicted, TRUE, FALSE)) %>%
group_by(actual, model) %>%
mutate(pct = n/sum(n)) %>%
ungroup() %>%
mutate(label = ifelse(match == TRUE,
paste0(round(pct * 100,1),'%'),
""),
model = factor(model, levels = c('Decisoin tree','Random forest','Xgboost',
'Logistic regression','SVM'))) %>%
ggplot(aes(x = actual,
y = pct,
fill = predicted,
label = label)) +
geom_col(position = 'dodge') +
geom_text(position = position_dodge(width = 1),
cex = 2.75,
hjust = -0.1) +
facet_wrap(~ model, ncol = 3) +
coord_flip() +
labs(title = 'Genre Accuracy by Model',
x= "",
y = 'Percent classified',
fill = "") +
ylim(c(0,.85)) +
theme(panel.grid.major.y = element_blank(), plot.title = element_text(hjust = 0.5)) +
scale_fill_brewer(palette = "Accent")